# -*- coding: utf-8 -*-

import json
from gensim import corpora, models
from src.utils import mkdirs, cut

def run(input_file, output_dir, stopword_path):
	"""Build and persist a gensim dictionary, corpus and TF-IDF model.

	Args:
		input_file: Path to a JSON file of documents; see get_texts() for
			the expected layout.
		output_dir: Directory prefix (expected to end with a path separator)
			where 'plan.dict', 'plan.corpus' and 'plan.model' are written.
		stopword_path: Path to a UTF-8 text file with one stopword per line.
	"""
	# Explicit encoding: stopwords are likely non-ASCII (this pipeline
	# processes Chinese text), so don't rely on the platform default.
	with open(stopword_path, encoding='utf-8') as f:
		stopwords = [line.strip() for line in f]

	# Tokenize each document, dropping stopwords.
	split_texts = [cut(text, stopwords) for text in get_texts(input_file)]

	mkdirs(output_dir)

	# Map every token to an integer id and persist the mapping.
	dictionary = corpora.Dictionary(split_texts)
	dictionary.save(output_dir + 'plan.dict')

	# Store the corpus to disk as a sparse (Matrix Market) matrix.
	corpus_vector = [dictionary.doc2bow(text) for text in split_texts]
	corpora.MmCorpus.serialize(output_dir + 'plan.corpus', corpus_vector)

	# Train a TF-IDF model over the bag-of-words corpus and persist it.
	tfidf_model = models.TfidfModel(corpus_vector)
	tfidf_model.save(output_dir + 'plan.model')

def get_texts(input_file):
	"""Load document texts from a JSON file.

	The file must contain a JSON array of objects; each object's 'content'
	key holds a list of lines, which are joined with newlines into one
	document string.

	Args:
		input_file: Path to the UTF-8 encoded JSON input file.

	Returns:
		A list of document strings, one per JSON object. Objects missing
		the 'content' key yield an empty string instead of raising.
	"""
	# json.load streams from the file object; explicit encoding avoids
	# platform-default decoding surprises with non-ASCII content.
	with open(input_file, 'r', encoding='utf-8') as f:
		contents = json.load(f)

	return ['\n'.join(it.get('content', [])) for it in contents]
